In [ ]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn import metrics
import shap
import lime
import lime.lime_tabular
import pickle
import warnings
warnings.filterwarnings('ignore')
shap.initjs()
No description has been provided for this image
In [ ]:
data_raw = pd.read_stata('MIMIC-SAD_dta_files/MIMIC-IV.dta')
data = pd.read_stata('MIMIC-SAD_dta_files/MIMIC-IV.dta')
dummies = pd.get_dummies(data['race'])
data = data.drop('race',axis=1).join(dummies)
dummies = pd.get_dummies(data['first_careunit'])
data = data.drop('first_careunit',axis=1).join(dummies)
In [ ]:
x=3
data.iloc[:,0+x*18:18+x*18]
data
Out[ ]:
stay_id age weight gender temperature heart_rate resp_rate spo2 sbp dbp ... OTHER WHITE unknown CCU CVICU MICU MICU/SICU NICU SICU TSICU
0 30000646 44.0 79.000000 0 37.000000 100.0 28.0 98.0 107.0 66.0 ... 0 0 0 1 0 0 0 0 0 0
1 30001446 56.0 119.300003 0 36.720001 82.0 22.0 90.0 75.0 56.0 ... 0 1 0 0 0 1 0 0 0 0
2 30002415 73.0 83.500000 1 36.439999 71.0 16.0 100.0 117.0 67.0 ... 0 1 0 0 1 0 0 0 0 0
3 30003226 67.0 93.449997 0 37.220001 89.0 17.0 98.0 111.0 63.0 ... 0 0 0 0 0 0 0 0 1 0
4 30004242 76.0 77.599998 1 36.720001 59.0 21.0 97.0 107.0 90.0 ... 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14615 39993425 93.0 47.799999 1 35.830002 115.0 16.0 99.0 97.0 71.0 ... 0 1 0 0 0 1 0 0 0 0
14616 39993476 67.0 93.000000 0 36.439999 81.0 16.0 100.0 112.0 60.0 ... 0 1 0 0 1 0 0 0 0 0
14617 39993968 91.0 57.500000 1 35.830002 43.0 17.0 100.0 78.0 39.0 ... 0 1 0 1 0 0 0 0 0 0
14618 39996044 59.0 66.400002 0 36.389999 105.0 23.0 100.0 107.0 63.0 ... 0 1 0 0 0 1 0 0 0 0
14619 39999301 78.0 107.699997 0 36.610001 58.0 15.0 96.0 108.0 62.0 ... 0 0 0 0 1 0 0 0 0 0

14620 rows × 61 columns

In [ ]:
data2 = data.drop(['deliriumtime', 'hosp_mort', 'icu28dmort', 'stay_id', 'icustay', 'hospstay', 'sepsistime'], axis=1)
In [ ]:
# Compute the correlation matrix
corr = data_raw.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 16))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
Out[ ]:
<Axes: >
No description has been provided for this image
In [ ]:
x=2
data2.iloc[:,0+x*18:18+x*18]
data2
Out[ ]:
age weight gender temperature heart_rate resp_rate spo2 sbp dbp mbp ... OTHER WHITE unknown CCU CVICU MICU MICU/SICU NICU SICU TSICU
0 44.0 79.000000 0 37.000000 100.0 28.0 98.0 107.0 66.0 75.0 ... 0 0 0 1 0 0 0 0 0 0
1 56.0 119.300003 0 36.720001 82.0 22.0 90.0 75.0 56.0 61.0 ... 0 1 0 0 0 1 0 0 0 0
2 73.0 83.500000 1 36.439999 71.0 16.0 100.0 117.0 67.0 87.0 ... 0 1 0 0 1 0 0 0 0 0
3 67.0 93.449997 0 37.220001 89.0 17.0 98.0 111.0 63.0 71.0 ... 0 0 0 0 0 0 0 0 1 0
4 76.0 77.599998 1 36.720001 59.0 21.0 97.0 107.0 90.0 94.0 ... 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14615 93.0 47.799999 1 35.830002 115.0 16.0 99.0 97.0 71.0 80.0 ... 0 1 0 0 0 1 0 0 0 0
14616 67.0 93.000000 0 36.439999 81.0 16.0 100.0 112.0 60.0 78.0 ... 0 1 0 0 1 0 0 0 0 0
14617 91.0 57.500000 1 35.830002 43.0 17.0 100.0 78.0 39.0 49.0 ... 0 1 0 1 0 0 0 0 0 0
14618 59.0 66.400002 0 36.389999 105.0 23.0 100.0 107.0 63.0 80.0 ... 0 1 0 0 0 1 0 0 0 0
14619 78.0 107.699997 0 36.610001 58.0 15.0 96.0 108.0 62.0 73.0 ... 0 0 0 0 1 0 0 0 0 0

14620 rows × 54 columns

In [ ]:
np.random.seed(123)
index = np.random.rand(len(data2)) < 0.7
train = data2[index]
test = data2[~index]
In [ ]:
test.to_stata('test_set.dta')
In [ ]:
train_raw = data_raw[index].to_stata('train_set_raw')
In [ ]:
print(index)
[ True  True  True ...  True  True  True]
In [ ]:
train.to_stata('train_set')
In [ ]:
train_matrix = xgb.DMatrix(train.loc[:, ~train.columns.isin(["sad"])], label=train["sad"])
test_matrix = xgb.DMatrix(test.loc[:, ~test.columns.isin(["sad"])], label=test["sad"])
In [ ]:
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 3,
    "eta": 0.1,
    "gamma": 0.5,
    "colsample_bytree": 1,
    "min_child_weight": 1,
    "subsample": 0.5,
    "seed": 123
}
watchlist = [(train_matrix, "train"), (test_matrix, "val")]
xgb_model = xgb.train(params=params, dtrain=train_matrix, num_boost_round=125, evals=watchlist,
                      early_stopping_rounds=10, verbose_eval=10)
[0]	train-logloss:0.63979	val-logloss:0.64264
[10]	train-logloss:0.56640	val-logloss:0.57160
[20]	train-logloss:0.54387	val-logloss:0.55215
[30]	train-logloss:0.53063	val-logloss:0.54359
[40]	train-logloss:0.52102	val-logloss:0.53830
[50]	train-logloss:0.51373	val-logloss:0.53539
[60]	train-logloss:0.50700	val-logloss:0.53184
[70]	train-logloss:0.50175	val-logloss:0.53095
[80]	train-logloss:0.49681	val-logloss:0.53020
[90]	train-logloss:0.49229	val-logloss:0.52977
[100]	train-logloss:0.48864	val-logloss:0.53065
In [ ]:
print(type(xgb_model))
<class 'xgboost.core.Booster'>
In [ ]:
pickle.dump(xgb_model, open("xgb.pkl", "wb"))
In [ ]:
xgb_pred_prob = xgb_model.predict(test_matrix)
xgb_pred = np.where(xgb_pred_prob > 0.5, 1, 0)
xgb_pred_factor = pd.factorize(xgb_pred)[0]
test_sad_factor = pd.factorize(test["sad"])[0]

confusion_matrix = metrics.confusion_matrix(xgb_pred_factor, test_sad_factor)
print(confusion_matrix)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

cm_display.plot()
plt.show() 
[[2329  721]
 [ 407  904]]
No description has been provided for this image
In [ ]:
xgb_pred_prob
Out[ ]:
array([0.18503565, 0.6261598 , 0.1572482 , ..., 0.7053726 , 0.32250598,
       0.18439463], dtype=float32)
In [ ]:
test[:1]
Out[ ]:
age weight gender temperature heart_rate resp_rate spo2 sbp dbp mbp ... OTHER WHITE unknown CCU CVICU MICU MICU/SICU NICU SICU TSICU
4 76.0 77.599998 1 36.720001 59.0 21.0 97.0 107.0 90.0 94.0 ... 0 0 0 0 0 0 0 0 0 1

1 rows × 54 columns

In [ ]:
test.iloc[:,41]
Out[ ]:
4        0
6        0
11       0
15       0
21       0
        ..
14597    0
14601    0
14606    0
14609    0
14612    0
Name: AISAN, Length: 4361, dtype: uint8
In [ ]:
xgb_model.predict(xgb.DMatrix(test.drop('sad',axis=1)[:1]))
Out[ ]:
array([0.18503565], dtype=float32)
In [ ]:
 
In [ ]:
xgb.plot_importance(xgb_model, importance_type='gain', max_num_features=30)
plt.title('feature importance: gain')
plt.show()
xgb.plot_importance(xgb_model, max_num_features=30)
plt.title('feature importance: weight')
plt.show()
xgb.plot_importance(xgb_model, importance_type='cover', max_num_features=30)
plt.title('feature importance: cover')
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
data2.loc[:, ~data2.columns.isin(["sad"])]
Out[ ]:
age weight gender temperature heart_rate resp_rate spo2 sbp dbp mbp ... OTHER WHITE unknown CCU CVICU MICU MICU/SICU NICU SICU TSICU
0 44.0 79.000000 0 37.000000 100.0 28.0 98.0 107.0 66.0 75.0 ... 0 0 0 1 0 0 0 0 0 0
1 56.0 119.300003 0 36.720001 82.0 22.0 90.0 75.0 56.0 61.0 ... 0 1 0 0 0 1 0 0 0 0
2 73.0 83.500000 1 36.439999 71.0 16.0 100.0 117.0 67.0 87.0 ... 0 1 0 0 1 0 0 0 0 0
3 67.0 93.449997 0 37.220001 89.0 17.0 98.0 111.0 63.0 71.0 ... 0 0 0 0 0 0 0 0 1 0
4 76.0 77.599998 1 36.720001 59.0 21.0 97.0 107.0 90.0 94.0 ... 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14615 93.0 47.799999 1 35.830002 115.0 16.0 99.0 97.0 71.0 80.0 ... 0 1 0 0 0 1 0 0 0 0
14616 67.0 93.000000 0 36.439999 81.0 16.0 100.0 112.0 60.0 78.0 ... 0 1 0 0 1 0 0 0 0 0
14617 91.0 57.500000 1 35.830002 43.0 17.0 100.0 78.0 39.0 49.0 ... 0 1 0 1 0 0 0 0 0 0
14618 59.0 66.400002 0 36.389999 105.0 23.0 100.0 107.0 63.0 80.0 ... 0 1 0 0 0 1 0 0 0 0
14619 78.0 107.699997 0 36.610001 58.0 15.0 96.0 108.0 62.0 73.0 ... 0 0 0 0 1 0 0 0 0 0

14620 rows × 53 columns

In [ ]:
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(data2.loc[:, ~data2.columns.isin(["sad"])])
In [ ]:
#dit kan in scherm individuele patient
shap.force_plot(explainer.expected_value, shap_values[0, :], data2.loc[:, ~data2.columns.isin(["sad"])].iloc[0, :])
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
# dit is niet zo nuttig maar wel gewoon heel vet
shap.force_plot(
    explainer.expected_value, shap_values[:1000, :], data2.loc[:, ~data2.columns.isin(["sad"])].iloc[:1000, :]
)
Out[ ]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [ ]:
# dit kan handig zijn voor globale uitleg over model, even kijken of dit wenselijker is dan feature importance gain van XGB
shap.summary_plot(shap_values, data2.loc[:, ~data2.columns.isin(["sad"])], plot_type="bar")
No description has been provided for this image
In [ ]:
# wel mooi deze maar een beetje convoluted
shap.summary_plot(shap_values, data2.loc[:, ~data2.columns.isin(["sad"])])
No description has been provided for this image

LIME¶

In [ ]:
X = data2.loc[:, ~data2.columns.isin(["sad"])]
X_train = train.loc[:, ~train.columns.isin(["sad"])]
explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=X.columns, mode='classification')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\indexes\base.py:3802, in Index.get_loc(self, key, method, tolerance)
   3801 try:
-> 3802     return self._engine.get_loc(casted_key)
   3803 except KeyError as err:

File ~\anaconda3\envs\INNO\lib\site-packages\pandas\_libs\index.pyx:138, in pandas._libs.index.IndexEngine.get_loc()

File ~\anaconda3\envs\INNO\lib\site-packages\pandas\_libs\index.pyx:144, in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(slice(None, None, None), 0)' is an invalid key

During handling of the above exception, another exception occurred:

InvalidIndexError                         Traceback (most recent call last)
Cell In[24], line 3
      1 X = data2.loc[:, ~data2.columns.isin(["sad"])]
      2 X_train = train.loc[:, ~train.columns.isin(["sad"])]
----> 3 explainer = lime.lime_tabular.LimeTabularExplainer(X_train, feature_names=X.columns, mode='classification')

File ~\anaconda3\envs\INNO\lib\site-packages\lime\lime_tabular.py:215, in LimeTabularExplainer.__init__(self, training_data, mode, training_labels, feature_names, categorical_features, categorical_names, kernel_width, kernel, verbose, class_names, feature_selection, discretize_continuous, discretizer, sample_around_instance, random_state, training_data_stats)
    209     discretizer = StatsDiscretizer(training_data, self.categorical_features,
    210                                    self.feature_names, labels=training_labels,
    211                                    data_stats=self.training_data_stats,
    212                                    random_state=self.random_state)
    214 if discretizer == 'quartile':
--> 215     self.discretizer = QuartileDiscretizer(
    216             training_data, self.categorical_features,
    217             self.feature_names, labels=training_labels,
    218             random_state=self.random_state)
    219 elif discretizer == 'decile':
    220     self.discretizer = DecileDiscretizer(
    221             training_data, self.categorical_features,
    222             self.feature_names, labels=training_labels,
    223             random_state=self.random_state)

File ~\anaconda3\envs\INNO\lib\site-packages\lime\discretize.py:178, in QuartileDiscretizer.__init__(self, data, categorical_features, feature_names, labels, random_state)
    176 def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
--> 178     BaseDiscretizer.__init__(self, data, categorical_features,
    179                              feature_names, labels=labels,
    180                              random_state=random_state)

File ~\anaconda3\envs\INNO\lib\site-packages\lime\discretize.py:51, in BaseDiscretizer.__init__(self, data, categorical_features, feature_names, labels, random_state, data_stats)
     48 self.random_state = check_random_state(random_state)
     50 # To override when implementing a custom binning
---> 51 bins = self.bins(data, labels)
     52 bins = [np.unique(x) for x in bins]
     54 # Read the stats from data_stats if exists

File ~\anaconda3\envs\INNO\lib\site-packages\lime\discretize.py:185, in QuartileDiscretizer.bins(self, data, labels)
    183 bins = []
    184 for feature in self.to_discretize:
--> 185     qts = np.array(np.percentile(data[:, feature], [25, 50, 75]))
    186     bins.append(qts)
    187 return bins

File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\frame.py:3807, in DataFrame.__getitem__(self, key)
   3805 if self.columns.nlevels > 1:
   3806     return self._getitem_multilevel(key)
-> 3807 indexer = self.columns.get_loc(key)
   3808 if is_integer(indexer):
   3809     indexer = [indexer]

File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\indexes\base.py:3809, in Index.get_loc(self, key, method, tolerance)
   3804         raise KeyError(key) from err
   3805     except TypeError:
   3806         # If we have a listlike key, _check_indexing_error will raise
   3807         #  InvalidIndexError. Otherwise we fall through and re-raise
   3808         #  the TypeError.
-> 3809         self._check_indexing_error(key)
   3810         raise
   3812 # GH#42269

File ~\anaconda3\envs\INNO\lib\site-packages\pandas\core\indexes\base.py:5925, in Index._check_indexing_error(self, key)
   5921 def _check_indexing_error(self, key):
   5922     if not is_scalar(key):
   5923         # if key is not a scalar, directly raise an error (the code below
   5924         # would convert to numpy arrays and raise later any way) - GH29926
-> 5925         raise InvalidIndexError(key)

InvalidIndexError: (slice(None, None, None), 0)
In [ ]:
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 3,
    "eta": 0.1,
    "gamma": 0.5,
    "colsample_bytree": 1,
    "min_child_weight": 1,
    "subsample": 0.5,
    "seed": 123
}
watchlist = [(train_matrix, "train"), (test_matrix, "val")]
xgb_model = xgb.train(params=params, dtrain=train_matrix, num_boost_round=125, evals=watchlist,
                      early_stopping_rounds=10, verbose_eval=10)
In [ ]: